# Extract the archive in the working directory, read activity.csv
# (presumably the file the archive contains), and print the structure of the
# resulting data frame.
unzip(zipfile = "./activity.zip")
activity <- read.csv(file = "activity.csv")
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Convert the date column to Date class so date functions can be applied to it.
activity[["date"]] <- as.Date(activity[["date"]])
For this part of the assignment, you can ignore the missing values in the dataset.
library(dplyr)
library(ggplot2)
# Total number of steps for each date. With na.rm = FALSE a date that contains
# a missing reading sums to NA, so it is dropped from the histogram and
# reported under NA's by summary() rather than being counted as a low total.
stepsByDay <- tapply(activity$steps, activity$date, sum, na.rm = FALSE)
# Histogram of the daily totals in bins of 1000 steps. Built with ggplot()
# because qplot() is deprecated in current ggplot2 releases.
ggplot(data.frame(total.steps = as.vector(stepsByDay)), aes(x = total.steps)) +
  geom_histogram(binwidth = 1000) +
  labs(x = "Total steps per day", y = "Frequency")
## Warning: Removed 8 rows containing non-finite values (stat_bin).
summary(stepsByDay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 41 8841 10765 10766 13294 21194 8
# Group the observations by 5-minute interval, then average the step counts
# within each interval, leaving missing readings out of the mean.
interval.activity <- activity %>%
  group_by(interval)
interval.stats <- interval.activity %>%
  summarise(mean.steps = mean(steps, na.rm = TRUE))
library(plotly)
# Line chart of the mean step count per 5-minute interval.
plot4 <- ggplot(
  data = interval.stats,
  mapping = aes(x = interval, y = mean.steps)
) +
  geom_line() +
  labs(
    title = "Average number of steps taken by 5-min interval, across all days",
    x = "5-min interval",
    y = "average number of steps"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 12, hjust = 0.5),
    axis.text.x = element_text(angle = 0, size = 10, hjust = 1),
    axis.text.y = element_text(angle = 2, size = 10, hjust = 1)
  )
# Replace the static chart with its plotly conversion and display it.
plot4 <- ggplotly(p = plot4)
plot4
# Show the row(s) of interval.stats whose mean step count equals the maximum.
interval.stats[with(interval.stats, mean.steps == max(mean.steps)), ]
## # A tibble: 1 x 2
## interval mean.steps
## <int> <dbl>
## 1 835 206.
# Count the missing values in each of the three columns.
with(activity, sum(is.na(steps)))
## [1] 2304
with(activity, sum(is.na(date)))
## [1] 0
with(activity, sum(is.na(interval)))
## [1] 0
# Fill every missing step count with the mean number of steps recorded for the
# same 5-minute interval, looked up in interval.stats.
# This is a single vectorised assignment instead of a row-by-row loop: it gives
# the same result, avoids the 1:length() pitfall on an empty data frame, and
# does not rewrite the column once per missing row.
missing.steps <- is.na(activity$steps)
activity$steps[missing.steps] <- interval.stats$mean.steps[
  match(activity$interval[missing.steps], interval.stats$interval)
]
summary(activity)
## steps date interval
## Min. : 0.00 Min. :2012-10-01 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-31 Median :1177.5
## Mean : 37.38 Mean :2012-10-31 Mean :1177.5
## 3rd Qu.: 27.00 3rd Qu.:2012-11-15 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-30 Max. :2355.0
# Recompute the total number of steps for each date from the data with the
# missing step counts filled in.
stepsByDay <- tapply(activity$steps, activity$date, sum, na.rm = TRUE)
# Histogram of the daily totals in bins of 1000 steps. Built with ggplot()
# because qplot() is deprecated in current ggplot2 releases.
ggplot(data.frame(total.steps = as.vector(stepsByDay)), aes(x = total.steps)) +
  geom_histogram(binwidth = 1000) +
  labs(x = "Total steps per day", y = "Frequency")
summary(stepsByDay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41 9819 10766 10766 12811 21194
Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
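Yes, although only slightly. The missing records have been replaced with interval means, so the eight days that previously had no daily total (NA) are now included in the histogram. The mean is unchanged at 10766 and the median moves from 10765 to 10766, but the totals are more concentrated around the centre: the 1st quartile rises from 8841 to 9819 and the 3rd quartile falls from 13294 to 12811. The y-axis of the histogram changes accordingly, because the added days raise the bar counts.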
Use the dataset with the filled-in missing values for this part.
# Name of the day of the week for every observation. The text returned by
# weekdays() depends on the session locale, so it is stored for display only
# and is not used for the classification below.
days <- weekdays(activity$date)
activity <- cbind(activity, days)
###
# Label each observation "weekday" or "weekend" from the numeric day of the
# week (POSIXlt wday: 0 = Sunday, 6 = Saturday). This works in any locale and
# for any number of rows, whereas matching English day names against a
# hard-coded 1:17568 placeholder left row numbers behind whenever a name did
# not match.
is.weekend <- as.POSIXlt(activity$date)$wday %in% c(0L, 6L)
activity$class.day <- factor(
  ifelse(is.weekend, "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
# Mean step count for every combination of 5-minute interval and day class.
activitytrend <- aggregate(
  steps ~ interval + class.day,
  data = activity,
  FUN = mean
)
# One line chart per day class, stacked in two rows, with no legend.
plot6 <- ggplot(
  data = activitytrend,
  mapping = aes(x = interval, y = steps, group = class.day)
) +
  geom_line(mapping = aes(color = class.day), lwd = 0.5) +
  facet_wrap(facets = ~ class.day, nrow = 2) +
  labs(
    title = "Week pattern: Average number of steps taken by 5-min interval",
    x = "Hour on the clock",
    y = "Number of steps"
  ) +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5),
    axis.text.x = element_text(size = 12, hjust = 0),
    axis.text.y = element_text(size = 12, hjust = 0),
    legend.position = "none"
  )
# Replace the static chart with its plotly conversion and display it.
plot6 <- ggplotly(p = plot6)
plot6